rm(list=ls())
knitr::opts_knit$set(root.dir = '~/OneDrive - University College London/Projects/Ostridge_PanAf/baypass/analysing_baypass_output/baypass_aux/')
subsps=c('all','ce', 'w')
env_data='f_over_sum_known_trees'
habitat_col='black'
col_scale=colorRampPalette(c('goldenrod1', 'khaki4', 'darkgreen'))(100)
#fprs=c(0.005, 0.001, 0.0005, 0.0001, 0.00005)
fprs=c(0.005, 0.001, 0.0005)
fprs.pct=paste0(fprs*100,"pct")
stats=c("raw_AF", "M_P")
FPRs were assigned in ./baypass_aux/scripts/f_over_sum_known_trees/f_over_sum_known_trees.baypass_aux_candidate.Rmd. Here I investigate the allele frequency patterns at candidate SNPs to verify that BayPass is accounting for population structure and identifing SNPs correlated with habitat. I also want to gain insights into the nature of these signals, e.g. the magnitude of allele frequency differences.
library(data.table)
options(datatable.fread.datatable=FALSE)
library(dplyr)
library(tidyverse)
source("../scripts/baypass_tools.R")
source("scripts/baypass_aux_tools.R")
source("../baypass_core/scripts/distribution_of_candidates_in_the_genome_tools-cov_cor.R")
source("../baypass_core/scripts/candidate_allele_frequency_patterns.R")
env_dir="../../../environmental_data/output/"
formatted_aux_out_dir="output/formatted_baypass_aux_output/"
formatted_aux_out_fprs_dir="output/baypass_aux_output_with_fprs/"
exome_ac_dir="../../../allele_frequencies/exome/output/"
## - all
## - ce
## - w
max_n=1000
cor.res=data.frame(Subspecies=character(), FPR=character(), Direction=character(), Stat=character(), r=numeric(), p=numeric())
cor_test=function(df, stat){
data.frame(cor.test(df[[stat]], df$forest_tree_pct, method="pearson")[c("estimate","p.value")])
}
for(subsp in subsps[2]){
### Add covs to AF data frame
cov.in.format=data.frame(Population=names(cov.in[[subsp]][['Real']]), forest_tree_pct=unlist(cov.in[[subsp]][['Real']]))
for(fpr in c(fprs)){
for(dir in c('Positive', 'Negative')){
set.seed(123)
if(dir=='Positive'){betai_cand.tmp=betai_fpr[[subsp]][betai_fpr[[subsp]]$fpr<fpr & betai_fpr[[subsp]]$M_Beta.median>=0,]}
if(dir=='Negative'){betai_cand.tmp=betai_fpr[[subsp]][betai_fpr[[subsp]]$fpr<fpr & betai_fpr[[subsp]]$M_Beta.median<=0,]}
if(nrow(betai_cand.tmp)>max_n){betai_cand.tmp=betai_cand.tmp[sample(nrow(betai_cand.tmp), max_n),]}
allele.freq.tmp=allele.freq.cand[[subsp]][allele.freq.cand[[subsp]]$chr_pos %in% betai_cand.tmp$chr_pos,]
allele.freq.tmp=merge(allele.freq.tmp, cov.in.format, by="Population")
for(stat in stats){
cor.res.tmp=allele.freq.tmp %>%
group_by(chr_pos) %>% do(cor_test(., stat=stat))
cor.res=rbind(cor.res, data.frame(Subspecies=subsp_name[[subsp]], FPR=paste0(100*fpr,"%"), Direction=dir, Stat=stat, r=cor.res.tmp$estimate, p=cor.res.tmp$p.value))
}
}
}
}
cor.res$FPR=factor(cor.res$FPR, levels=rev(unique(cor.res$FPR)[order(unique(cor.res$FPR))]))
stat_names=c("M_P"="Unstandardised allele frequencies", "M_Pstd"="Standardised allele frequencies", "raw_AF"="Raw allele frequencies")
cor_stat_names=c('r'="Pearson's r", 'p'="Pearson's r p-value")
for(stat in stats){
for(cor_stat in c('r', 'p')){
plot=ggplot(cor.res[cor.res$Stat==stat,], aes_string(x='FPR', y=cor_stat, fill='Direction', col='Direction'))+
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
#plot.subtitle = element_text(hjust = 0.5),
legend.position = "none",
strip.text = element_text(face = "italic"),
strip.background =element_rect(fill="white")) +
labs(title=stat_names[[stat]],
#subtitle=stat_names[[stat]],
x="FPR tail",
y=cor_stat_names[[cor_stat]]) +
geom_boxplot(outlier.shape = 1) +
facet_grid(. ~Subspecies, space='free') +
scale_discrete_manual(aesthetics = c("fill"), values = c('Positive' = 'deepskyblue', 'Negative'='red')) +
scale_discrete_manual(aesthetics = c("colour"), values = c('Positive' = 'deepskyblue4', 'Negative'='darkred'))
if(cor_stat=="r"){
plot=plot+
geom_hline(yintercept=0)+
ylim(-1,1)
}
if(cor_stat=="p"){
plot=plot+
ylim(0,1)
}
print(plot)
}
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.